Loading files

setwd("/Users/dutlu42p/repos/mahuika/bully_gbs") 
require("pcadapt")
## Warning: package 'pcadapt' was built under R version 3.5.2
data <- read.pcadapt("output_files/populations.snps.vcf", type = "vcf") # New dataset
## No variant got discarded.
## Summary:
## 
##  - input file:               output_files/populations.snps.vcf
##  - output file:              /var/folders/1g/hdrjtwrj77b1g8ll485bl2sw0000gq/T//RtmpXNdEKG/file375e6e57b638.pcadapt
## 
##  - number of individuals detected:   94
##  - number of loci detected:      9605
## 
## 9605 lines detected.
## 94 columns detected.
metadata<-read.table("metadata/metadata_clean.txt",h=T)
metadata<-cbind(rownames(metadata),metadata)[1:5]
colnames(metadata)<-c("sample","lake","depth","group","sex")
head(metadata)
##          sample   lake depth   group sex
## WK16-02 WK16-02 Wanaka    90    Deep   m
## WK16-16 WK16-16 Wanaka    90    Deep   m
## WK16-33 WK16-33 Wanaka    60    Deep   f
## WK16-66 WK16-66 Wanaka    15 Shallow   f
## WK16-80 WK16-80 Wanaka     5 Shallow   f
## WK16-97 WK16-97 Wanaka     5 Shallow   m

Now that we have all the data loaded in, let’s have a quick look at the general population structure. We will make a PCA, colored by lake

x <- pcadapt(input = data, K = 20) 
rownames(x$scores)<-metadata[,1]
colnames(x$scores)<-paste("PC",1:20,sep="")
write.table(x$scores,"output_files/scores_pca.txt",sep="\t")
plot(x, option = "screeplot")

plot(x, option = "scores", pop=metadata$lake) 

Coloring by depth:

plot(x, option = "scores", pop=metadata$group) 

Theere seems to be much more variation in Lake wanaka, let’s therefore take them as two different datasets to investigate a bit more clearly what is happening.

Interestingly, it looks like the deep ones are closer to the lake wanaka ones than the shallow ones

Again, I split the vcf outside of R.

cut -f 1 metadata/metadata_clean.txt | grep   -E "WK[-0-9.]+" > wanaka_inds.txt
vcftools --vcf output_files/populations.snps.vcf --keep   wanaka_inds.txt --recode 
mv out.recode.vcf  output_files/wanaka.snps.vcf
vcftools --vcf output_files/populations.snps.vcf --remove   wanaka_inds.txt --recode 
mv out.recode.vcf  output_files/wakatipu.snps.vcf

Let’s look at the lake-depth structure now:

#wakatipu
data_wakatipu<- read.pcadapt("output_files/wakatipu.snps.vcf", type = "vcf") 
## No variant got discarded.
## Summary:
## 
##  - input file:               output_files/wakatipu.snps.vcf
##  - output file:              /var/folders/1g/hdrjtwrj77b1g8ll485bl2sw0000gq/T//RtmpXNdEKG/file375e6f225645.pcadapt
## 
##  - number of individuals detected:   45
##  - number of loci detected:      9605
## 
## 9605 lines detected.
## 45 columns detected.
metadata_wakatipu<-metadata[which(metadata$lake=="Wakatipu"),] # that is the way we found them, we remove them too
dim(metadata_wakatipu)
## [1] 45  5
x_wakatipu <- pcadapt(input = data_wakatipu, K = 20) 

#To plot depth, let's make a color palette

colpal <- colorRampPalette(c("grey", "black"))
colors<-colpal(length(levels(as.factor(metadata_wakatipu$depth))))

plot(x_wakatipu, option = "scores", pop=metadata_wakatipu$depth,col=colors) #Wanaka

plot(x_wakatipu, option = "scores", pop=metadata_wakatipu$group) #Wanaka

#Wanaka
data_wanaka <- read.pcadapt("output_files/wanaka.snps.vcf", type = "vcf") 
## No variant got discarded.
## Summary:
## 
##  - input file:               output_files/wanaka.snps.vcf
##  - output file:              /var/folders/1g/hdrjtwrj77b1g8ll485bl2sw0000gq/T//RtmpXNdEKG/file375e374e0503.pcadapt
## 
##  - number of individuals detected:   49
##  - number of loci detected:      9605
## 
## 9605 lines detected.
## 49 columns detected.
metadata_wanaka<-metadata[which(metadata$lake=="Wanaka"),] # that is the way we found them, we remove them too
dim(metadata_wanaka)
## [1] 49  5
x_wanaka <- pcadapt(input = data_wanaka, K = 20) 
plot(x_wanaka, option = "scores", pop=metadata_wanaka$depth,col=colors) #Wanaka

plot(x_wanaka, option = "scores", pop=metadata_wanaka$group) #Wanaka

There might be a tiny bit of depth structure, but almost nothing.

Open questions: Was the structure we saw before associated to the relatedness of the deep fish of Wakatipu to Wanaka? Is it a visual artifact?

Let’s do some structure analysis and some Fst based trees

Structure analsyis

I run it on Wanaka, Wakatipu, and on both pops together but without outliers. The conversion from vcf to faststructure input files is done using PGDspider2.1.1.5


##convert to faststr files

mkdir faststructure plots
conda activate faststr # speciic to the way faststructure  is installed on my computer
structure.py -K 2  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 3  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 4  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 5  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 6  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 7  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 8  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 9  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
structure.py -K 10  --input=output_files/populations.snps--output=faststructure/populations.snps --format=str
#wakatipu
structure.py -K 2  --input=output_files/wakatipu.snps --output=faststructure/wakatipu.snps  --format=str
structure.py -K 3  --input=output_files/wakatipu.snps --output=faststructure/wakatipu.snps  --format=str
structure.py -K 4  --input=output_files/wakatipu.snps --output=faststructure/wakatipu.snps  --format=str
structure.py -K 5  --input=output_files/wakatipu.snps --output=faststructure/wakatipu.snps  --format=str
#wanaka
structure.py -K 2  --input=output_files/wanaka.snps --output=faststructure/wanaka.snps  --format=str
structure.py -K 3 --input=output_files/wanaka.snps --output=faststructure/wanaka.snps  --format=str
structure.py -K 4  --input=output_files/wanaka.snps --output=faststructure/wanaka.snps  --format=str
structure.py -K 5  --input=output_files/wanaka.snps --output=faststructure/wanaka.snps  --format=str

chooseK.py --input=faststructure/wanaka.snps
#Model complexity that maximizes marginal likelihood = 2
#Model components used to explain structure in data = 1
chooseK.py --input=faststructure/wakatipu.snps
#Model complexity that maximizes marginal likelihood = 2
#Model components used to explain structure in data = 1
chooseK.py --input=faststructure/populations.snps.nooutliers.NOMISSING
#Model complexity that maximizes marginal likelihood = 2
#Model components used to explain structure in data = 2

The choose K function suggests that only the structure between lake is clearly visible, let’s see how it looks like in practice:

  library("pophelper")
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
## pophelper v2.2.9 ready.
  for (pop in c("wanaka.snps","wakatipu.snps")){
    print(pop)
      ffiles <- list.files(path="faststructure/",pattern="meanQ")[grep(paste("^",pop,sep=""),perl=T,list.files(path="faststructure/",pattern= "meanQ"))]
      flist <- readQ(files=paste("faststructure/",ffiles,sep=""))
      ###Reading 
      if (pop == "populations.snps") {indcodes <- metadata[,1]}
      if (pop == "wanaka.snps") {indcodes <- metadata_wanaka[,1]}
      if (pop == "wakatipu.snps") {indcodes <- metadata_wakatipu[,1]}
      for (i in 1:length(flist)){
          rownames(flist[[i]]) <-indcodes
      }
  plotQ(flist,imgoutput="join",showindlab=T,useindlab=T,height=7,width=70,grplabangle=0,exportpath=paste("plots/",pop,sep=""),ordergrp=T)
  }
## [1] "wanaka.snps"
## Drawing plot ...
## plots/wanaka.snpsJoined4Files-20191210122520.png exported.
## [1] "wakatipu.snps"
## Drawing plot ...
## plots/wakatipu.snpsJoined4Files-20191210122522.png exported.

As these output 3 png files directly, I link them below showing no Structure whatsoever except for the lake specific one. I was going to order things but as the result is a bit trivial and the ordering is not I have not done it for now, will do if we want to resent it!

Wanaka

plots/wanaka.snpsJoined4Files-20191029153842.png

plots/wanaka.snpsJoined4Files-20191029153842.png

Wakatipu

plots/wakatipu.snpsJoined4Files-20191029153845.png

plots/wakatipu.snpsJoined4Files-20191029153845.png

That looks Ok, but we might reorder them by pop and by depth to look at both lakes together

Reordering

#determine proper order numerically from old order
# to save time I got t
new_order_of_the_94_samples <- c(7,19,29,30,41,42,53,54,65,66,77,89,8,9,20,21,31,43,55,67,78,79,90,91,10,22,32,33,44,45,56,57,68,69,80,92,11,23,81,93,34,46,58,70,82,5,6,17,18,28,40,52,64,76,88,4,16,27,39,51,63,74,75,86,87,15,26,38,49,50,61,62,73,85,3,14,25,36,37,48,60,72,84,94,1,2,12,13,24,35,47,59,71,83)

#find files
filestoread<-paste("faststructure/", list.files(path="faststructure/",pattern="meanQ")[grep(paste("^populations.snps",sep=""),perl=T,list.files(path="faststructure/",pattern= "meanQ"))],sep="")
#reorder every single file according to proper order
for (filename in filestoread){
  print(filename)
  tempdata<-read.table(filename,h=F)
  tempdata<-tempdata[new_order_of_the_94_samples,]
  newfilename = paste(strsplit(filename,".populations")[[1]][1],"/","reordered",strsplit(filename,".populations")[[1]][2],sep="")
  write.table(tempdata,newfilename,quote=F,col.names=F,row.names=F,sep="\t")
}
## [1] "faststructure/populations.snps.10.meanQ"
## [1] "faststructure/populations.snps.2.meanQ"
## [1] "faststructure/populations.snps.3.meanQ"
## [1] "faststructure/populations.snps.4.meanQ"
## [1] "faststructure/populations.snps.5.meanQ"
## [1] "faststructure/populations.snps.6.meanQ"
## [1] "faststructure/populations.snps.7.meanQ"
## [1] "faststructure/populations.snps.8.meanQ"
## [1] "faststructure/populations.snps.9.meanQ"
#play with the metadata and so we have plots with sample names or depth on it
indcodes_v2<-paste(metadata[new_order_of_the_94_samples,1],metadata[new_order_of_the_94_samples,2],metadata[new_order_of_the_94_samples,3],"m",sep="_")

#plot

  ffiles <- list.files(path="faststructure/",pattern="meanQ")[grep(paste("reor",sep=""),perl=T,list.files(path="faststructure/",pattern= "meanQ"))]
      flist <- readQ(files=paste("faststructure/",ffiles,sep=""))

#indcodes_v1

#indcodes_v2
      for (i in 1:length(flist)){
          rownames(flist[[i]]) <-indcodes_v2
}
  plotQ(flist,imgoutput="join",showindlab=T,useindlab=T,height=7,width=70,grplabangle=0,exportpath=paste("plots/reordered.snps",sep=""),ordergrp=T)
## Drawing plot ...
## plots/reordered.snpsJoined9Files-20191210122525.png exported.
#K=2 alone
  plotQ(flist[2],imgoutput ="sep", showindlab=T,useindlab=T ,height=7,width=35,grplabangle=0,exportpath="plots/K2", rainbow(2),showdiv=TRUE,clustercol=c("red","blue"),divcol="white",divtype=1,divsize=1,sortind=NA,grplab=NA)
## Drawing plot ...
## plots/K2reordered.snps.2.png exported.

K=2 to K=10 plots/reordered.snpsJoined9Files-20191029153922.png

K=2 alone

plots/K2reordered.snps.2.png

plots/K2reordered.snps.2.png

Finally get some basic population stats (Fst, Ho)

require("hierfstat")
## Loading required package: hierfstat
require("vcfR")
## Loading required package: vcfR
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.8.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
require("poppr")
## Loading required package: poppr
## Warning: package 'poppr' was built under R version 3.5.2
## Loading required package: adegenet
## Loading required package: ade4
## 
##    /// adegenet 2.1.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
## 
## Attaching package: 'adegenet'
## The following object is masked from 'package:hierfstat':
## 
##     read.fstat
## This is poppr version 2.8.3. To get started, type package?poppr
## OMP parallel support: available
# a bit of play around wqith convewrsion to get a vcf into hierfstat
dataVCF<-vcf <- read.vcfR("output_files/populations.snps.vcf", verbose = T)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 9605
##   column count: 103
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 9605
##   Character matrix gt cols: 103
##   skip: 0
##   nrows: 9605
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant: 9605
## All variants processed
geninddata<-vcfR2genind(dataVCF)
geninddata@pop<-(metadata[,2])
#metadata[,1] == rownames(geninddata$tab)

dirs, tlet’s get some basic stats per lake, per

hierfstatdataall <-genind2hierfstat(geninddata,pop=paste(metadata[,2],metadata[,4],sep=""))
hierfstatdata2lakes <-genind2hierfstat(geninddata,pop=metadata[,2])

sep_pop <- seppop(geninddata)
hierfstatdatawanaka<-genind2hierfstat(sep_pop$Wanaka,pop=rep(1,49))
hierfstatdatawakatipu<-genind2hierfstat(sep_pop$Wakatipu,pop=rep(1,45))

#Overall
basic.stats(hierfstatdataall)$overall
##      Ho      Hs      Ht     Dst     Htp    Dstp     Fst    Fstp     Fis 
##  0.1281  0.1272  0.1354  0.0082  0.1381  0.0109  0.0604  0.0790 -0.0071 
##    Dest 
##  0.0125
#Wanaka
basic.stats(hierfstatdatawanaka)$overall
##     Ho     Hs     Ht    Dst    Htp   Dstp    Fst   Fstp    Fis   Dest 
## 0.1459 0.1468 0.1468 0.0000    NaN    NaN 0.0000    NaN 0.0059    NaN
#Wakatipu
basic.stats(hierfstatdatawakatipu)$overall
##      Ho      Hs      Ht     Dst     Htp    Dstp     Fst    Fstp     Fis 
##  0.1088  0.1071  0.1071  0.0000     NaN     NaN  0.0000     NaN -0.0161 
##    Dest 
##     NaN

One can see from this that Wanaka is more variable than Wakatipu!

genet.dist(hierfstatdata2lakes,method="WC84")
matrix_4_depth_fst<-as.matrix(genet.dist(hierfstatdataall,,method="WC84"))
colnames(matrix_4_depth_fst)<-levels(hierfstatdataall$pop)
rownames(matrix_4_depth_fst)<-levels(hierfstatdataall$pop)
matrix_4_depth_fst
##                  WakatipuDeep WakatipuShallow  WanakaDeep WanakaShallow
## WakatipuDeep     0.0000000000   -0.0002000228 0.097651865   0.106026741
## WakatipuShallow -0.0002000228    0.0000000000 0.116983828   0.126122435
## WanakaDeep       0.0976518655    0.1169838285 0.000000000   0.001426047
## WanakaShallow    0.1060267412    0.1261224346 0.001426047   0.000000000
#test significance of fst between lakes
test<-boot.ppfst(hierfstatdata2lakes)
test$ul # upper limit of CI
##          Wakatipu    Wanaka
## Wakatipu       NA 0.1218122
## Wanaka         NA        NA
test$ll  # lower limit of CI
##          Wakatipu    Wanaka
## Wakatipu       NA 0.1109339
## Wanaka         NA        NA
#test for pops
test_matrix_4_depth_fst<-boot.ppfst(hierfstatdataall)
test_matrix_4_depth_fst$ul # upper limit of CI
##                 WakatipuDeep WakatipuShallow WanakaDeep WanakaShallow
## WakatipuDeep              NA     0.001513352  0.1028158   0.111239290
## WakatipuShallow           NA              NA  0.1221096   0.131155778
## WanakaDeep                NA              NA         NA   0.002035966
## WanakaShallow             NA              NA         NA            NA
test_matrix_4_depth_fst$ll
##                 WakatipuDeep WakatipuShallow WanakaDeep WanakaShallow
## WakatipuDeep              NA    -0.002201314 0.09187002  0.1006238203
## WakatipuShallow           NA              NA 0.11092880  0.1198988330
## WanakaDeep                NA              NA         NA  0.0006330164
## WanakaShallow             NA              NA         NA            NA